import re

from bs4 import BeautifulSoup

from books.book_spider import BookSpider
from utils import replace_filename_invalid_chars


class TangsanshuLaSpider(BookSpider):
    """Novel spider for the tangsanshu.la site.

    Provides the site-specific chapter-link extraction and chapter-text
    clean-up hooks on top of ``BookSpider``.
    """

    # Number of leading <dd> entries in the chapter list that are ignored.
    # NOTE(review): presumably the "latest chapters" preview shown above the
    # full list -- confirm against the live page layout.
    _SKIPPED_LEADING_ENTRIES = 12

    # Deletes non-breaking spaces and ASCII spaces in a single pass.
    _SPACE_DELETION_TABLE = str.maketrans('', '', '\xa0 ')

    # Promotional snippet running from "http:" up to the mobile-site address.
    # Non-greedy so that two snippets on one line do not swallow the real
    # text between them; dots are escaped so only the literal host matches.
    _AD_PATTERN = re.compile(r'http:.*?网址:m\.tangsanshu\.la')

    def _get_links(self, html):
        """Extract chapter links from a book's table-of-contents page.

        Args:
            html: Markup of the page containing the ``div.listmain`` list.

        Returns:
            A dict mapping the filename-safe chapter title to the chapter
            URL, in page order. Empty when the chapter list container is
            not present. Entries sharing a title overwrite one another.
        """
        soup = BeautifulSoup(html, 'html.parser')
        container = soup.find('div', class_='listmain')
        if container is None:
            # Unexpected page layout: report "no chapters" rather than
            # failing with an AttributeError on None.
            return {}

        links = {}
        for dd in container.find_all('dd')[self._SKIPPED_LEADING_ENTRIES:]:
            anchor = dd.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # Placeholder entry without a usable link; nothing to fetch.
                continue
            title = anchor.text
            if '章 ' not in title:
                # Separate the chapter marker from the chapter name. Only the
                # first marker is touched so a later '章' inside the name
                # itself is left alone.
                title = title.replace('章', '章 ', 1)
            title = replace_filename_invalid_chars(title)
            # NOTE(review): assumes self._domain is supplied by BookSpider
            # and that hrefs are site-relative paths -- confirm.
            links[title] = self._domain + href
        return links

    def _remove_content_invalid_chars(self, content):
        """Strip spaces and the site's promotional snippet from chapter text.

        Args:
            content: Raw chapter text.

        Returns:
            The text with every non-breaking space and ASCII space removed
            and each "http:...网址:m.tangsanshu.la" snippet deleted.
        """
        content = content.translate(self._SPACE_DELETION_TABLE)
        return self._AD_PATTERN.sub('', content)
